# Note: this script takes several hours to run on a server.
# Start from an empty workspace: remove every object in the current
# environment, then trigger a garbage collection.
# NOTE(review): rm(list = ls()) also wipes the caller's objects when this file
# is source()d into an existing session; prefer running it in a fresh R process.
rm(list = ls())
gc()

library(lubridate)
library(foreign)
library(ggplot2)
library(ggthemes)
library(estimatr)
library(tidyverse)
library(mice)

# Load the data set that is imputed further down in this script.
df <- readRDS("data/svy.rds")

# detecting columns with NAs ----------------------------------------------
# Names of all columns that contain at least one NA. Iterating over the
# columns with vapply() avoids growing a vector inside a loop, behaves the
# same for data.frames and tibbles, and returns character(0) (instead of
# failing on `1:0`) when there are no columns.
names_na <- names(df)[vapply(df, anyNA, logical(1))]
names_na

# Making sure there are no -97/-98/-99/-100 values ---------------------------
# Print the index of every column in which at least one value contains one of
# the sentinel codes. Columns are extracted with [[ ]] so each one is a plain
# vector for both data.frames and tibbles.
for (i in seq_along(df)) {
  # grepl() returns FALSE for NA entries. The previous sum(str_detect(...))
  # became NA as soon as a column held a single missing value, so exactly the
  # columns with NAs were never reported.
  if (any(grepl("-97|-98|-99|-100", as.character(df[[i]])))) {
    print(i)
  }
}

# Exploring missing data -----------------------------------------------
# Fraction of missing values per column (NA count divided by row count).
p_missing <- vapply(df, function(col) sum(is.na(col)), integer(1)) / nrow(df)

# Show only the columns that have missing values, most affected first.
p_missing[p_missing > 0] %>% sort(decreasing = TRUE)

# running mice code -------------------------------------------------------
# Dry run with zero iterations: the call is only used to obtain the default
# predictor matrix and the default method vector.
imp <- parlmice(df, cluster.seed = 35734, maxit = 0)
predM <- imp[["predictorMatrix"]]
meth <- imp[["method"]]

# Set the predictor-matrix columns of the variables we want to leave out to 0.
predM[, c("weights", "response_num")] <- 0

head(predM)

# Examining character and factor columns
# One "<name> - <class>" label per column. class() may return several classes
# (e.g. c("ordered", "factor")); they are collapsed into a single string so
# every column yields exactly one label instead of triggering a length-mismatch
# warning and silently keeping only the first class.
col_type <- vapply(
  seq_along(df),
  function(i) {
    paste0(names(df)[i], " - ", paste(class(df[[i]]), collapse = "/"))
  },
  character(1)
)

# Select on the actual column type rather than pattern-matching the label, so
# a column whose *name* contains "character" or "factor" is not a false hit
# and ordered factors are included.
col_type[vapply(df, is.character, logical(1), USE.NAMES = FALSE)]
col_type[vapply(df, is.factor, logical(1), USE.NAMES = FALSE)]

# This produces 10 imputations (1 on each core) with 30 max iterations for each.
# - method = "rf" is passed as a single string, i.e. one method for all variables.
# - predM is the predictor matrix prepared above.
# - cl.type = "FORK" is only available on Unix-alike systems.
# NOTE(review): `m = 1` is kept from the original call; the total of 10 appears
# to come from n.core * n.imp.core rather than from `m` - confirm against the
# parlmice documentation.
imp2 <- parlmice(df, maxit = 30, m = 1,
                 predictorMatrix = predM,
                 nnet.MaxNWts = 3000,
                 method = "rf",
                 printFlag = TRUE, # full argument name; `print` relied on partial matching
                 n.imp.core = 1,
                 cluster.seed = 978623, cl.type = "FORK",
                 n.core = 10)

# Save the imputation object before any post-processing, so the multi-hour run
# is on disk even if a later step fails.
saveRDS(imp2, "data/imputed_data.rds")

# Long-format export of the imputations. include = TRUE is passed through to
# mice::complete(); presumably it adds the original (unimputed) data as well -
# confirm in the mice documentation.
imputed_data_long <- mice::complete(imp2, action = "long", include = TRUE)
saveRDS(imputed_data_long, "data/imputed_long.rds")
